{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Plotting data from SQLite database\n",
    "\n",
    "This notebook queries the databases to generate a matrix of plots for the number of articles/images per year, run separately for each category.\n",
    "\n",
    "This code reproduces the average number of images per article by year top-16 plot.\n",
    "\n",
    "For more on plots, see https://github.com/re-imaging/re-imaging/blob/master/sqlite-scripts/db_plots.ipynb\n",
    "\n",
    "## Structure\n",
    "\n",
    "- setup\n",
    "- load list of categories\n",
    "- pull specific data (and save as pickle)\n",
    "- format data\n",
    "- generate plot\n",
    "- save image\n",
    "\n",
    "Notebook is intended to be navigated and blocks to be run selectively, rather than the whole notebook being executed."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "Import required libraries, connect to SQLite database, create cursor, fetch table info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.cm as cm\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import sqlite3\n",
    "import pickle\n",
    "import copy\n",
    "import json\n",
    "import math\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the arXiv SQLite database and create the cursor `c` used by the query cells.\n",
    "db_path = os.path.expanduser(\"~/data/db/arxiv_db_images.sqlite3\")\n",
    "\n",
    "# sqlite3.connect() creates a new, empty database when the file does not exist,\n",
    "# which would only surface later as a confusing \"no such table\" error.\n",
    "# Fail here with a clear message instead.\n",
    "if not os.path.isfile(db_path):\n",
    "    raise FileNotFoundError(f\"SQLite database not found: {db_path}\")\n",
    "\n",
    "db = sqlite3.connect(db_path)\n",
    "c = db.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Inspect the schema of the `metadata` table: one tuple per column.\n",
    "info = c.execute(\"PRAGMA TABLE_INFO(metadata)\").fetchall()\n",
    "\n",
    "print(\"\\nColumn Info:\\nID, Name, Type, NotNull, DefaultVal, PrimaryKey\")\n",
    "for col in info:\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build category lists\n",
    "\n",
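    "First get a full list of all the primary categories by querying the SQLite database; these lists drive the later queries. Three orderings are built (by article count, by image count, and alphabetical). `catlist` is set to the alphabetical list at the end of the cell; reassign it there to use a different ordering."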
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build three lists of (primary category, count) rows.\n",
    "# The primary category is the first space-separated token of `metadata.cat`.\n",
    "# `catlist` (used by the later per-category queries) ends up as the alphabetical list.\n",
    "primary_cat = \"substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)\"\n",
    "\n",
    "print(\"pulling all categories by total number of articles\")\n",
    "articles_catlist = c.execute(f'''\n",
    "    SELECT {primary_cat}, count(metadata.identifier)\n",
    "    FROM metadata\n",
    "    GROUP BY {primary_cat}\n",
    "    ORDER BY count(metadata.identifier) DESC\n",
    "    ''').fetchall()\n",
    "# uncomment if you want to use article count to order categories\n",
    "# catlist = articles_catlist\n",
    "\n",
    "print(\"pulling all categories by total number of images\")\n",
    "images_catlist = c.execute(f'''\n",
    "    SELECT {primary_cat}, count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    GROUP BY {primary_cat}\n",
    "    ORDER BY count(images.identifier) DESC\n",
    "    ''').fetchall()\n",
    "\n",
    "# the alphabetical list leaves out articles created in 2019 and 2020\n",
    "print(\"pulling all categories in alphabetical order\")\n",
    "alpha_catlist = c.execute(f'''\n",
    "    SELECT {primary_cat}, count(metadata.identifier)\n",
    "    FROM metadata\n",
    "    WHERE strftime(\"%Y\", metadata.created) != '2019'\n",
    "    AND strftime(\"%Y\", metadata.created) != '2020'\n",
    "    GROUP BY {primary_cat}\n",
    "    ORDER BY {primary_cat} ASC\n",
    "    ''').fetchall()\n",
    "\n",
    "rows = alpha_catlist\n",
    "catlist = alpha_catlist\n",
    "print(\"done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick look at the active category list: its length and the first 8 rows.\n",
    "print(f'The list of categories totals {len(catlist)}. Here are the first entries:',\n",
    "      catlist[:8], sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Finding the change in rank between number of articles and number of images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per category:\n",
    "# [category, article-rank, images-rank, rank-difference]\n",
    "# The article rank is the position in `articles_catlist`; the last two fields\n",
    "# start at 0 and are filled in by the following cells.\n",
    "\n",
    "ordering = [[name, rank, 0, 0] for rank, (name, _total) in enumerate(articles_catlist)]\n",
    "print(ordering)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# find the difference in category list when ordering by article numbers vs image numbers\n",
    "#\n",
    "# Fills the images-rank field (index 2) of every `ordering` row with the\n",
    "# position of that category in `images_catlist`.\n",
    "\n",
    "# build the category -> image-rank lookup once, instead of rescanning\n",
    "# `images_catlist` for every article category\n",
    "image_rank = {ic[0]: icount for icount, ic in enumerate(images_catlist)}\n",
    "\n",
    "for acount, ac in enumerate(articles_catlist):\n",
    "    # categories that never appear in `images_catlist` keep the placeholder 0\n",
    "    if ac[0] in image_rank:\n",
    "        ordering[acount][2] = image_rank[ac[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rank-difference = article-rank minus images-rank, stored in the last field of each row\n",
    "for entry in ordering:\n",
    "    entry[3] = entry[1] - entry[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the rank table, one row per line:\n",
    "# [category, article-rank, images-rank, change]\n",
    "for name, article_rank, images_rank, change in ordering:\n",
    "    print([name, article_rank, images_rank, change])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating Plots\n",
    "\n",
    "### Article data\n",
    "\n",
    "Then use that list of primary categories to query the database for how many articles per year. Store it in the `article_data` variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# by category\n",
    "# number of articles for each year\n",
    "#\n",
    "# Builds `article_data` as a list of [category, [year, ...], [article count, ...]]\n",
    "# rows, one per entry in `catlist`, and caches it as a pickle.\n",
    "\n",
    "sql = ('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM metadata\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?\n",
    "    AND strftime(\"%Y\", metadata.created) != '2019'\n",
    "    AND strftime(\"%Y\", metadata.created) != '2020'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "\n",
    "data = []\n",
    "\n",
    "for cat in catlist:\n",
    "    category = cat[0]\n",
    "    print(\"querying for category: \" + str(category))\n",
    "    rows = c.execute(sql, (category, )).fetchall()\n",
    "    print(rows)\n",
    "\n",
    "    # each result row is (count, year); split into two parallel lists\n",
    "    years = [row[1] for row in rows]\n",
    "    totals = [row[0] for row in rows]\n",
    "    data.append([category, years, totals])\n",
    "\n",
    "print(\"*\" * 20)\n",
    "print(\"done\")\n",
    "\n",
    "article_data = data\n",
    "\n",
    "# WRITE PKL\n",
    "\n",
    "filename = \"articles_cat_year_data.pkl\"\n",
    "with open(filename, \"wb\") as write_file:\n",
    "    pickle.dump(article_data, write_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Dump the full `data` list for inspection (when run right after the article\n",
    "# query cell, this is the same list as `article_data`).\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ PKL\n",
    "# Reload `article_data` from the pickle written by the article query cell,\n",
    "# so the SQL queries do not have to be run again.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "\n",
    "filename = \"articles_cat_year_data.pkl\"\n",
    "with open(filename, \"rb\") as read_file:\n",
    "    article_data = pickle.load(read_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show how many category rows were loaded, followed by the rows themselves.\n",
    "print(f'length of article_data: {len(article_data)}', article_data, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Image data\n",
    "\n",
    "Pull the number of images in each year. Store it in the `image_data` variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# by category\n",
    "# total number of images for each year\n",
    "#\n",
    "# Builds `image_data` as a list of [category, [year, ...], [image count, ...]]\n",
    "# rows, one per entry in `catlist`, and caches it as a pickle.\n",
    "\n",
    "# single definition of the cache file name; this is the name the READ PKL cell opens\n",
    "image_pkl_filename = \"images_cat_year_data.pkl\"\n",
    "\n",
    "# The WHERE clause filters on metadata columns, so images without a matching\n",
    "# metadata row are dropped even though this is a LEFT JOIN.\n",
    "sql = ('''\n",
    "    SELECT count(images.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata on images.identifier = metadata.identifier\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?\n",
    "    AND strftime(\"%Y\", metadata.created) != '2019'\n",
    "    AND strftime(\"%Y\", metadata.created) != '2020'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "\n",
    "data = []\n",
    "\n",
    "for cat in catlist:\n",
    "    print(\"querying for category: \" + str(cat[0]))\n",
    "    c.execute(sql, (cat[0], ))\n",
    "    rows = c.fetchall()\n",
    "    print(rows)\n",
    "\n",
    "    # each result row is (count, year); split into two parallel lists\n",
    "    years = [row[1] for row in rows]\n",
    "    totals = [row[0] for row in rows]\n",
    "\n",
    "    data.append([cat[0], years, totals])\n",
    "\n",
    "print(\"*\" * 20)\n",
    "print(\"done\")\n",
    "\n",
    "image_data = data\n",
    "\n",
    "# WRITE\n",
    "with open(image_pkl_filename, \"wb\") as write_file:\n",
    "    pickle.dump(image_data, write_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of category rows in `image_data` (the query cell appends one row per entry in `catlist`).\n",
    "print(len(image_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ PKL\n",
    "# Reload `image_data` from the pickle written by the image query cell,\n",
    "# so the SQL queries do not have to be run again.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "image_pkl_filename = \"images_cat_year_data.pkl\"\n",
    "with open(image_pkl_filename, \"rb\") as read_file:\n",
    "    image_data = pickle.load(read_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show how many category rows were loaded, followed by the first 10 of them.\n",
    "print(f'length of image_data: {len(image_data)}', image_data[:10], sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get averages (images per article)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# this code copies the article data then modifies it by \n",
    "# dividing the total number of images by the number of articles\n",
    "#\n",
    "# `average_data` keeps the layout of `article_data`\n",
    "# ([category, [year, ...], [value, ...]]); each value becomes\n",
    "# images / articles for that year, or 0 when the year has no images.\n",
    "\n",
    "average_data = copy.deepcopy(article_data)\n",
    "\n",
    "# Look image rows up by category name rather than by list position, so a\n",
    "# reordered or shorter `image_data` (e.g. reloaded from an older pickle)\n",
    "# cannot silently pair the wrong categories.\n",
    "images_by_cat = {row[0]: row for row in image_data}\n",
    "\n",
    "for count, article in enumerate(average_data):\n",
    "\n",
    "    # set all values to zero\n",
    "    for index in range(len(article[2])):\n",
    "        article[2][index] = 0\n",
    "\n",
    "    image_row = images_by_cat.get(article[0])\n",
    "    if image_row is None:\n",
    "        print(\"!\" * 20)\n",
    "        print(f'no image data for category {article[0]}')\n",
    "        continue\n",
    "\n",
    "    article_years = article_data[count][1]\n",
    "    article_totals = article_data[count][2]\n",
    "\n",
    "    for i, year in enumerate(image_row[1]):\n",
    "        print(\"*\" * 20)\n",
    "        print(f'count {count} | year {year} | i {i}')\n",
    "        print(article_years)\n",
    "        try:\n",
    "            # only the lookup can raise ValueError, so only the lookup is guarded\n",
    "            listindex = article_years.index(str(year))\n",
    "        except ValueError:\n",
    "            # image year without an article entry for this category\n",
    "            print(\"!\" * 20)\n",
    "            print(\"didn't find index\")\n",
    "            continue\n",
    "        print(\"listindex:\", listindex)\n",
    "        print(\"count:\", count)\n",
    "        print(\"i:\", i)\n",
    "        print(\"no images:\", image_row[2][i])\n",
    "        print(\"no articles:\", article_totals[listindex])\n",
    "        average = image_row[2][i] / article_totals[listindex]\n",
    "        print(\"average:\", average)\n",
    "        article[2][listindex] = average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 10 rows of `average_data`: [category, years, average images per article].\n",
    "print(average_data[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# copy data over for plotting\n",
    "# (a deep copy, so the cleaning cells below can modify `data` without touching `average_data`)\n",
    "data = copy.deepcopy(average_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean data\n",
    "- remove any entries of \"2019\" or \"2020\" in the years and totals columns of data (don't have full data for these years).\n",
    "- rewrite all entries as integers rather than strings (otherwise there will be problems when adjusting the axes)\n",
    "- find the minimum and maximum for any entries, so that we can set our axes later as needed.\n",
    "\n",
    "Data is saved as nested lists in the format\n",
    "```\n",
    "[\n",
    "    [cat1, [year1, year2...yearX], [total1, total2...totalX]],\n",
    "    [cat2, [year1, year2...yearX], [total1, total2...totalX]],\n",
    "    ...\n",
    "    [catZ, [year1, year2...yearX], [total1, total2...totalX]]\n",
    "]\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean year entries - make sure to run this!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The year entries arrive as strings; rewrite each one in place as an integer\n",
    "# so later numeric comparisons and axis limits work.\n",
    "\n",
    "for cat in data:\n",
    "    year_list = cat[1]\n",
    "    for pos, year in enumerate(year_list):\n",
    "        year_list[pos] = int(year)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Remove years that aren't being used in plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop unwanted years from both the year list and the matching value list of every row.\n",
    "\n",
    "# list of years to remove\n",
    "years_to_remove = [2020, 2019]\n",
    "\n",
    "# NOTE: this is an alias of `data`, not a copy -- the deletions below act on `data` directly\n",
    "temp_data = data\n",
    "\n",
    "for i, data_row in enumerate(data):\n",
    "    num_years = len(data_row[1])\n",
    "    # walk a reversed snapshot of the years: deleting from the back never\n",
    "    # shifts the index of an entry that is still to be visited\n",
    "    for ii, data_year in enumerate(reversed(list(data_row[1]))):\n",
    "        if data_year not in years_to_remove:\n",
    "            continue\n",
    "        index = num_years - ii - 1\n",
    "        print(f'found entry, i: {i} | ii: {ii} | data_year: {data_year} | index {index}')\n",
    "        del temp_data[i][2][index]\n",
    "        del temp_data[i][1][index]\n",
    "        print(\"*\" * 20)\n",
    "\n",
    "data[:] = temp_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test to make sure there is still a total for each year\n",
    "\n",
    "for cat in data:\n",
    "    if len(cat[1]) != len(cat[2]):\n",
    "        # `cat` is a [category, years, totals] row: report its name\n",
    "        # (concatenating the row itself to a str raises TypeError)\n",
    "        print(\"problem with category: \" + str(cat[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: report any 2019/2020 entries still present in the year lists.\n",
    "leftover_years = [y for d in data for y in d[1] if y in (2020, 2019)]\n",
    "for y in leftover_years:\n",
    "    print(f'found year {y}')\n",
    "count = len(leftover_years)\n",
    "print(\"-\" * 20)\n",
    "print(f'found a total of {count} entries with year 2019 or 2020')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save data \n",
    "Interim progress, to prevent having to run SQL queries again : )\n",
    "Save as either json file or pickle for reloading.\n",
    "First set filename"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the serialization format; it doubles as the file extension.\n",
    "# save_mode = \"json\"\n",
    "save_mode = \"pkl\"\n",
    "\n",
    "# alternative base names:\n",
    "#   articles_cat_year\n",
    "#   images_cat_year\n",
    "filename = f\"average_images_article_cat_year.{save_mode}\"\n",
    "\n",
    "print(filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### JSON\n",
    "saves as human-readable JSON format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WRITE\n",
    "\n",
    "# with open(filename, \"w\") as write_file:\n",
    "#     json.dump(data, write_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ\n",
    "# Only runs when save_mode is \"json\": loads the file named by `filename` into `load_data`.\n",
    "if save_mode == \"json\":\n",
    "    load_data = []\n",
    "\n",
    "    with open(filename, \"r\") as read_file:\n",
    "        load_data = json.load(read_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### pickle\n",
    "Save data as a serialized file using pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#READ\n",
    "# Only runs when save_mode is \"pkl\": loads the file named by `filename` into `load_data`.\n",
    "# That file is produced by the (commented-out) pickle WRITE cell below, so the\n",
    "# WRITE cell has to be run once before this cell can succeed.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "\n",
    "if save_mode == \"pkl\":\n",
    "    load_data = []\n",
    "\n",
    "    # the with-block closes the file on exit, so no explicit close() is needed\n",
    "    with open(filename, \"rb\") as read_file:\n",
    "        load_data = pickle.load(read_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WRITE\n",
    "# with open(filename, \"wb\") as write_file:\n",
    "#     pickle.dump(data, write_file)\n",
    "#     write_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Check data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the reloaded rows the working `data` and preview the first 10.\n",
    "# NOTE: `load_data` only exists once one of the READ cells above has run.\n",
    "data = load_data\n",
    "print(load_data[:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate plots"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load in the imported data in the `data` variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first 10 rows of `data`, one row per line.\n",
    "for row in data[:10]:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data\n",
    "\n",
    "#### For average number of images per article by year top-16 plot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just get the first 16 categories from the catlist, e.g. if generating top-16 by image count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the rows of `load_data` for the 16 categories with the most images,\n",
    "# in `images_catlist` order.\n",
    "# The loop variables deliberately avoid the name `c`: that is the database\n",
    "# cursor, and rebinding it to a category string would make every later\n",
    "# `c.execute(...)` call fail.\n",
    "selected_cats = [cat_row[0] for cat_row in images_catlist[:16]]\n",
    "\n",
    "data = []\n",
    "for cat_name in selected_cats:\n",
    "    for d in load_data:\n",
    "        if cat_name == d[0]:\n",
    "            data.append(d)\n",
    "print(len(selected_cats))\n",
    "print(selected_cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# full names for giving plot titles\n",
    "selected_cats_full = [\"Computer Science: Computer Vision\",\n",
    "                      \"High Energy Physics - Phenomenology\",\n",
    "                      \"Astrophysics\",\n",
    "                      \"Astrophysics of Galaxies\",\n",
    "                      \"Computer Science: Machine Learning\",\n",
    "                      \"Solar and Stellar Astrophysics\",\n",
    "                      \"Cosmology and Nongalactic Astrophysics\",\n",
    "                      \"Quantum Physics\",\n",
    "                      \"High Energy Physics - Theory\",\n",
    "                      \"High Energy Astrophysical Phenomena\",\n",
    "                      \"Mesoscale and Nanoscale Physics\",\n",
    "                      \"Strongly Correlated Electrons\",\n",
    "                      \"Mathematics: Numerical Analysis\",\n",
    "                      \"High Energy Physics - Experiment\",\n",
    "                      \"General Relativity\", #  and Quantum Cosmology\n",
    "                      \"Condensed Matter: Statistical Mechanics\"\n",
    "                     ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or select specific categories to plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# selected_cats = [\"hep-ph\", \"astro-ph\", \"cs.CV\", \"astro-ph.GA\", \"astro-ph.CO\", \"astro-ph.SR\",\n",
    "#                 \"quant-ph\", \"hep-th\", \"astro-ph.HE\", \"cond-mat.mes-hall\", \"cond-mat.str-el\",\n",
    "#                 \"hep-ex\", \"cond-mat.stat-mech\", \"nucl-th\", \"gr-qc\", \"cs.LG\"]\n",
    "\n",
    "# data = []\n",
    "# for c in selected_cats:\n",
    "#     for d in load_data:\n",
    "#         if c == d[0]:\n",
    "#             data.append(d)\n",
    "# print(len(data))\n",
    "# print(selected_cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Print every row currently in `data`, one per line.\n",
    "for d in data:\n",
    "    print(d)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Find max and min\n",
    "\n",
    "Go through each value in the data to find the maximum and minimums for plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the maximums and minimums of year and no. articles for figuring out axes\n",
    "minY = math.inf\n",
    "maxY = -(math.inf)\n",
    "minA = math.inf\n",
    "maxA = -(math.inf)\n",
    "\n",
    "for cat in data:\n",
    "#     print(len(cat))\n",
    "    print(cat[0])\n",
    "    if min(cat[1]) < minY: minY = min(cat[1])\n",
    "    if max(cat[1]) > maxY: maxY = max(cat[1])\n",
    "    if min(cat[2]) < minA: minA = min(cat[2])\n",
    "    if max(cat[2]) > maxA: maxA = max(cat[2])\n",
    "    print(\"min year: \" + str(min(cat[1])))\n",
    "    print(\"max year: \" + str(max(cat[1])))\n",
    "    print(\"min articles/images: \" + str(min(cat[2])))\n",
    "    print(\"max articles/images: \" + str(max(cat[2])))\n",
    "    print(\"*\" * 20)\n",
    "    \n",
    "print(\"minY: \" + str(minY))\n",
    "print(\"maxY: \" + str(maxY))\n",
    "print(\"minA: \" + str(minA))\n",
    "print(\"maxA: \" + str(maxA))\n",
    "              \n",
    "print(\"done\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Save data in org format\n",
    "Use org-friendly table format. This can be printed to console or written to a file. For posting to Github and rendered in Github markdown."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output path for the org-format table written by the next cell.\n",
    "# NOTE: this reuses the `filename` variable that the JSON/pickle READ cells open,\n",
    "# so re-running those cells after this one would open the .org file instead.\n",
    "# filename = \"stats_images_cat_year.org\"\n",
    "filename = \"stats_average_images_article_cat_year.org\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the data in an org-friendly format for posting on github\n",
    "# with nested lists\n",
    "\n",
    "with open(filename, \"w\") as write_file:\n",
    "    for cat in data:\n",
    "        print(\"* \" + cat[0], file=write_file)\n",
    "        joined = list(zip(cat[1], cat[2]))\n",
    "        #     print(joined)\n",
    "        print(\"|-|-|\", file=write_file)\n",
    "        for j in joined:\n",
    "            print('|' + str(j[0]) + \"|\" + str(j[1]) + \"|\", file=write_file)\n",
    "        print(\"|-|-|\", file=write_file)\n",
    "write_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Plotting matrix of scatterplots\n",
    "\n",
    "Plot data in two formats\n",
    "- with shared x and y axes, for comparison across data\n",
    "- with individual x and y axes taken from min/max of each plot automatically, for individual trends\n",
    "- finally, save as high resolution (300 dpi) image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot switches.\n",
    "# In the plotting cell, bArticles picks the line style and bLog10 switches the\n",
    "# y-axis to a log scale; note that the plotting cell resets both to False\n",
    "# before using them. bAverage is not read by any cell in this notebook.\n",
    "bArticles = True\n",
    "# bArticles = False\n",
    "bAverage = True\n",
    "# bAverage = False\n",
    "bLog10 = False\n",
    "# bLog10 = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# manually set the maximum for the Y-axis to ignore large outliers\n",
    "maxA = 32"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot for \"Images of the arXiv\" paper\n",
    "\n",
    "#### Average number of images per article by year in each category.\n",
    "\n",
    "Number of images published per year in each category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grid of per-category line plots, one panel per row of `data`.\n",
    "# Every panel gets the same fixed limits (years 1991-2018, values 0..maxA);\n",
    "# the y-axis is shared within each grid row.\n",
    "\n",
    "xdim = 4\n",
    "ydim = 4\n",
    "\n",
    "bLog10 = False\n",
    "bArticles = False\n",
    "\n",
    "fig, ax = plt.subplots(ydim, xdim, sharey='row') # sharex='col', \n",
    "fig.subplots_adjust(hspace=0.5, wspace=0.3)\n",
    "fig.set_size_inches(16, 12)\n",
    "\n",
    "# if bArticles: fig.suptitle(\"arXiv total articles per year between 1991 and 2018\\nShared Axes\", x=0.5, y=0.92, size=28)\n",
    "# else: fig.suptitle(\"arXiv total images per year between 1991 and 2018\\nShared Axes\", x=0.5, y=0.92, size=28)\n",
    "\n",
    "data_size = len(data)\n",
    "line_style = '--.' if bArticles else '--k.'\n",
    "\n",
    "# ax.flat walks the grid row by row, so idx equals (row * xdim) + column\n",
    "for idx, panel in enumerate(ax.flat):\n",
    "    if idx >= data_size:\n",
    "        # fewer data rows than panels: leave the remaining axes empty\n",
    "        continue\n",
    "    panel.plot(data[idx][1], data[idx][2], line_style)\n",
    "    # title and total are looked up by position, not by category name\n",
    "    title_string = f'{selected_cats_full[idx]}\\ntotal: {(images_catlist[idx][1])}'\n",
    "    # alternative without the total: title_string = f'{selected_cats_full[idx]}'\n",
    "    panel.title.set_text(title_string)\n",
    "    panel.axis([1991, 2018, 0, maxA])\n",
    "    if bLog10:\n",
    "        panel.set_yscale('log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the figure created by the plotting cell twice, as SVG and as PNG,\n",
    "# using relative file names (i.e. into the current working directory).\n",
    "fig.savefig(\"plot_images_cat_year_indax_shareY_top16_v4.svg\", dpi=300, bbox_inches='tight',\n",
    "    pad_inches=0, transparent=False)\n",
    "fig.savefig(\"plot_images_cat_year_indax_shareY_top16_v4.png\", dpi=300, bbox_inches='tight',\n",
    "    pad_inches=0 )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
